####################
## Warm-up 1: Create a vector called "test.score" and calculate the mean and standard deviation.
#####################
## Store the seven test scores in a numeric vector named test.score
test.score <- c(80, 90, 70, 30, 40, 80, 75)

## mean() returns the average of the scores
mean(x = test.score)

## sd() returns the standard deviation (SD) of the scores
sd(x = test.score)


####################
## Warm-up 2: t-test
## Create another vector called "test.score2" for the test scores of another group, and perform a t-test.
#####################
## Scores of the second group go into their own vector
test.score2 <- c(55, 80, 60, 90, 60, 90, 95)

## t.test() compares the two groups of scores
t.test(x = test.score, y = test.score2)

## Open the help page for t.test
?t.test

####################
## Data analysis example 1: Eye Bank (slide 9)
## We will use Eye Bank data to learn to 1) read the data, 2) retrieve a variable (column),
## 3) get basic statistics, and 4) perform a t-test and a Chi-squared test.
##
## First, let's read the data into "dat".
#####################

## Point R at the folder that holds the workshop files.
## Change this path to the matching folder on your own computer.
setwd("/Users/moon/EinsteinMed Dropbox/Jee Young Moon/class/R workshop/2024-Melissa")

## Load the first sheet of the Excel workbook into "dat"
library(readxl)
dat <- read_excel("EyeBank.xlsx", sheet = 1)

## Take a first look: dimensions, first and last rows, structure, per-column summary
dim(dat)
head(dat)
tail(dat)
str(dat)
summary(dat)

## The cellcount column is retrieved with the $ operator
dat$cellcount
## Missing cell counts: is.na() flags each missing entry, sum() counts the flags
is.na(dat$cellcount)
sum(is.na(dat$cellcount))


# The age column is retrieved the same way
dat$age
summary(dat$age)

## Logical indicator for the age cut-off (TRUE when age >= 76)
x <- dat$age >= 76

## Label age >= 76 as 'old' and age < 76 as 'young'
dat$age.group <- ifelse(x, "old", "young")

##########################
## I. Cell count analysis
## We will analyze the cell count (continuous) by age group. 
## (a) We will summarize the cell count by mean and SD according to the age group.
## (b) T-test
## (c) Linear regression.
##########################
## a. Mean and SD of cell count overall
## Without na.rm, mean() returns NA when the column contains missing values.
mean(dat$cellcount)
## na.rm = TRUE drops the missing values before the statistic is computed.
mean(dat$cellcount, na.rm = TRUE)
sd(dat$cellcount, na.rm = TRUE)

## Mean and SD of cell count by age group
## Make a subset of the data for the old group.
## Use double equal signs (==) when asking whether a value is equal to something.
## A single equal sign ("=") assigns a value, similar to "<-".
dat.old <- subset(dat, age.group == "old")

mean(dat.old$cellcount, na.rm = TRUE)
sd(dat.old$cellcount, na.rm = TRUE)


## Equivalently, subset() can be combined with mean() or sd() in one R command.
mean(subset(dat, age.group == "old")$cellcount, na.rm = TRUE)
sd(subset(dat, age.group == "old")$cellcount, na.rm = TRUE)


###################
## Small practice. 
###################
## Get mean and SD of cell count for young group


###############
## T-test
###############
## b. t-test on cell count by age group
## Same approach as in Warm-up 2: pass each group's cell counts to t.test().
dat.young <- subset(dat, age.group == "young")

t.test(x = dat.old$cellcount, y = dat.young$cellcount)


## The formula interface does the same comparison without building the subsets.
## See slide 13.
t.test(cellcount ~ age.group, data = dat)



## c. Linear regression
## lm() fits a linear regression described by a formula.
## Calling summary() on the fitted model (fit1) adds details such as standard errors and p-values.
lm(cellcount ~ age.group, data = dat)
fit1 <- lm(cellcount ~ age.group, data = dat)
fit1
summary(fit1)

## factor
# factor() is commonly applied to binary or categorical variables to fix the order of their categories.
# The first level serves as the reference group.

dat$age.group <- factor(dat$age.group, levels = c("young", "old"))

## With 'young' listed first, it becomes the reference group.
fit2 <- lm(cellcount ~ age.group, data = dat)
summary(fit2)

## Additional predictors are added to the formula with "+".
fit3 <- lm(cellcount ~ age.group + sex + diabetes, data = dat)
summary(fit3)

##################
## II. Transplant acceptance
## We will analyze the transplant acceptance (binary) by age group. 
## (a) We will summarize transplant acceptance by count and percentage according to  age group.
## (b) Chi-square test, Fisher's exact test.
## (c) Logistic regression.
####################
## a. Get the count using table() function

table(dat$accepted.transplant)

table(dat$diabetes)

# If you want to know the number of NA (missing) values in diabetes
table(dat$diabetes, useNA = "ifany")


## A cross-tabulation between accepted.transplant (rows) and age.group (columns)
table(dat[, c("accepted.transplant", "age.group")])

tab <- table(dat[, c("accepted.transplant", "age.group")])
colSums(tab)
rowSums(tab)

## a little advanced.. to get %
## prop.table() with margin = 2 divides every cell by its column total, so the
## proportions within each age group add up to 1 whatever the number of rows.
prop.table(tab, margin = 2)

## b. Chi-squared test and Fisher's exact test on the cross-tabulation
chisq.test(tab)
fisher.test(tab)

## Counts that are already cross-tabulated can be typed straight into a matrix.
## matrix() fills column by column unless told otherwise.
mat <- matrix(c(827, 1907, 102, 106), ncol = 2)
mat
## byrow = TRUE fills row by row instead
matrix(c(827, 1907, 102, 106), ncol = 2, byrow = TRUE)

chisq.test(mat)


## c. logistic regression
## accepted.transplant is a binary variable, so we fit a logistic regression.
## A logistic regression can further adjust for other covariates.
glm(accepted.transplant ~ age.group, dat, family = binomial("logit"))

## Fix the level order so that 'No' is the first (reference) level of the outcome.
dat$accepted.transplant <- factor(dat$accepted.transplant, levels = c("No", "Yes"))

fit.transplant <- glm(accepted.transplant ~ age.group, dat, family = binomial("logit"))
summary(fit.transplant)

## Adjusted model with sex and diabetes as additional covariates.
fit.transplant3 <- glm(accepted.transplant ~ age.group + sex + diabetes, dat, family = binomial("logit"))
summary(fit.transplant3)



#######################
## Practice
## Fill the table in slide 14 by diabetes
#######################



#############
## table1 and tableone example
## Now, we learned how to get basic stats.
## We want to expedite this process over multiple variables.
## table1() and CreateTableOne() functions give good Table 1.
## Both functions provide summary statistics.
## table1() does not produce p-values but shows how many values are missing; CreateTableOne() produces p-values.
############
## Install table1 only when it is not already available, so that re-running
## the script does not reinstall the package every time.
if (!requireNamespace("table1", quietly = TRUE)) {
  install.packages("table1")
}

## Create a table 1
library(table1)

## Variables left of "|" are summarised; age.group on the right defines the columns.
table1(~ age + sex + cellcount + accepted.transplant + diabetes | age.group, dat)

## Keep the table in an object so it can be written to a file.
## Note: this reuses the name "tab", which held the cross-tabulation earlier in the script.
tab <- table1(~ age + sex + cellcount + accepted.transplant + diabetes | age.group, dat)
write.table(tab, "table1.csv", col.names = TRUE, row.names = FALSE, sep = ",")

## exclude missing values
tab.nomissing <- table1(~ age + sex + cellcount + accepted.transplant + diabetes | age.group, dat, render.missing = NULL)
write.table(tab.nomissing, "table1-nomissing.csv", col.names = TRUE, row.names = FALSE, sep = ",")



## Another way to create a table 1 with p-values using CreateTableOne() in tableone package
## Install tableone only when it is not already available, so that re-running
## the script does not reinstall the package every time.
if (!requireNamespace("tableone", quietly = TRUE)) {
  install.packages("tableone")
}
library(tableone)

## Create a master table 1 for vars according to the "strata"
## The function calculates all following stats.
## Continuous variables: mean (SD),  t-test, median (IQR), Wilcoxon test
## Categorical variables: count (%),  Chi-square test, Fisher's exact test
tab2 <- CreateTableOne(vars = c("age", "sex", "cellcount", "accepted.transplant", "diabetes"), strata = "age.group", data = dat,
    argsNormal = list(var.equal = FALSE))


## To print out the table. You can choose which statistical test and statistics to be used.
## for continuous variables, you want to present (a) mean (SD) and p-value by t-test [default], or (b) median (IQR) and Wilcoxon test
## for categorical variables, you want to present (a) count (%) and p-value by Chi-square test [default], (b) count (%) and p-value by Fisher's exact test
print(tab2, nonnormal = c("age", "cellcount"), exact = c("sex", "accepted.transplant", "diabetes"), quote = TRUE)


## Median (IQR) and Wilcoxon test for continuous variables; Fisher's exact test for categorical variables.
## The value returned by print() is what gets written to the CSV file.
write.csv(print(tab2, nonnormal = c("age", "cellcount"), exact = c("sex", "accepted.transplant", "diabetes"), quote = TRUE), file = "table1-tableone-nonparametric.csv")

## Mean (SD) and t-test for continuous variables; Fisher's exact test for categorical variables.
write.csv(print(tab2, exact = c("sex", "accepted.transplant", "diabetes"), quote = TRUE), file = "table1-tableone-parametric.csv")


#################
## Practice
## Create Table 1 by diabetes using table1() or CreateTableOne()
##################







####################
## Plots: slide 15
## R is very good at making plots.
## We will show how to make basic plots in R.
####################
## 1-variable plots
## Histogram: first with default labels, then with a custom x-axis label and title
hist(dat$cellcount)
hist(dat$age, xlab = "Age (years)", main = "Histogram")

## barplot of the number of rows in each age group
barplot(table(dat$age.group))
## names.arg sets the bar labels; it is spelled out in full rather than relying on
## partial argument matching. The labels follow the factor level order (young, old).
barplot(table(dat$age.group), names.arg = c("Age < 76", "Age>=76"), ylab = "Count", main = "Barplot")


## Plots of two variables: slide 16
## Scatter plot, first from two vectors, then from a formula
plot(dat$age, dat$cellcount)
plot(cellcount ~ age, data = dat)

## Custom axis labels and title; xlim restricts the x-axis to ages 40 through 60
plot(cellcount ~ age, data = dat, xlab = "Age (years)", ylab = "Cell count (count/mm2)", main = "Cell count with age")
plot(cellcount ~ age, data = dat, xlab = "Age (years)", ylab = "Cell count (count/mm2)", main = "Cell count with age", xlim = c(40, 60))

## Fit cell count on age, redraw the scatter plot, and overlay the fitted line
fit <- lm(cellcount ~ age, data = dat)
plot(cellcount ~ age, data = dat, xlab = "Age (years)", ylab = "Cell count (count/mm2)", main = "Cell count with age")
abline(fit, col = "blue") ## add a regression line

## Write the same figure to a PDF file: open the device, draw, then close it
pdf("cellcount-age.pdf", width = 5, height = 5)
plot(cellcount ~ age, data = dat, xlab = "Age (years)", ylab = "Cell count (count/mm2)", main = "Cell count with age")
abline(fit, col = "blue") ## add a regression line
dev.off()


## Box plot: a factor on the right-hand side of the formula gives one box per group
plot(cellcount ~ factor(age.group), data = dat)

## Same plot with readable group names and axis labels
plot(cellcount ~ age.group, data = dat, names = c("Age < 76", "Age>=76"), ylab = "Cell count (count/mm2)", xlab = "Age group")





###############
## Export data (slide 18)
## After working with your data (cleaning), you might want to save the data.
## You can save the cleaned data into .csv or .txt files.
## Another option is to save the R objects into xxx.RData file. This will preserve the special attributes on the data such as factor. 
## load() function can read xxx.RData file in R.
################
## Save the data as a comma-separated file and as a tab-separated text file
write.csv(dat, file = "eyebank-updated-10082024.csv")
write.table(dat, file = "eyebank-updated-10082024.txt", sep = "\t")


## Save the R object itself, then read it back; verbose = TRUE reports what is loaded
save(dat, file = "eyebank-10082024.RData")
load("eyebank-10082024.RData", verbose = TRUE)


################
## Quit R
##################
## End the R session (q() is the short form of quit())
quit()


 








